BigQuery GoogleAnalytics SQL
Predict Visitor Purchases with a Classification Model in BQMLの記録用 BigQuery ML クーポンもらったので、無料でできた。
自分なりの解釈をつけていく。
このセッションの目的は、あるユーザーの将来の購入見込み率を算定すること。
kaggleの competitionでも同じ目標(objective)だった kaggleの場合は、当然もっと複雑だけど、、ここの考え方は参考になる。
時系列のモデル作成・評価は、windowサイズ、データの期間の設定が難しそう。
基本のCV算出
日別の形にするには?
code: cv.sql
WITH visitors AS(
SELECT
date,
COUNT(DISTINCT fullVisitorId) AS total_visitors
FROM data-to-insights.ecommerce.web_analytics
group by date
),
purchasers AS(
SELECT
date,
COUNT(DISTINCT fullVisitorId) AS total_purchasers
FROM data-to-insights.ecommerce.web_analytics
WHERE totals.transactions IS NOT NULL
group by date
)
SELECT
visitors.date,
total_visitors,
total_purchasers,
round(total_purchasers / total_visitors*100 ,1) AS conversion_rate
FROM visitors
join purchasers on visitors.date = purchasers.date
order by 1
top5 selling products,
以下のSQLになってるけど、Google Analyticsでは、cart や detail段階でも、productQuantityは数字があるので、たぶん、間違ってると思う。以下のどちらかを条件付けするのがよいはず。
where p.localProductRevenue is not null
where h.eCommerceAction.action_type = 6
code: top5_products.sql
SELECT
p.v2ProductName,
p.v2ProductCategory,
SUM(p.productQuantity) AS units_sold,
ROUND(SUM(p.localProductRevenue/1000000),2) AS revenue
FROM data-to-insights.ecommerce.web_analytics,
UNNEST(hits) AS h,
UNNEST(h.product) AS p
GROUP BY 1, 2
ORDER BY revenue DESC
LIMIT 5;
再帰セッションで買ってるユーザーの特定
再帰で購入がある場合が1, 他は0とラベル作り
このあとで、ここに特徴量を追加して、trainging, testデータ・セットにする形
code: return_and_purchased.sql
# visitors who bought on a return visit (could have bought on first as well
WITH all_visitor_stats AS (
SELECT
fullvisitorid, # 741,721 unique visitors
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid
)
SELECT
COUNT(DISTINCT fullvisitorid) AS total_visitors,
will_buy_on_return_visit
FROM all_visitor_stats
GROUP BY will_buy_on_return_visit
初回セッション時の bounce, timeOnSiteを特徴量にして、その後に購入があったかどうか?
code: create_dataset.sql
SELECT
* EXCEPT(fullVisitorId)
FROM
# features
(SELECT
fullVisitorId,
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site
FROM
data-to-insights.ecommerce.web_analytics
WHERE
totals.newVisits = 1)
JOIN
(SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM
data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid)
USING (fullVisitorId)
ORDER BY time_on_site DESC
LIMIT 10;
データセットの形ができたので、modelを作る. 特徴量は2つ
optionsで、model_type, labelsを指定
as 以下で、label, 特徴量のデータを作る
trainの期間とevalの期間を分離する。predictの期間は..predictを何に使うかによるかな。
code: create_model.sql
CREATE OR REPLACE MODEL ecommerce.classification_model
OPTIONS
(
model_type='logistic_reg',
)
AS
SELECT
* EXCEPT(fullVisitorId)
FROM
# features
(SELECT
fullVisitorId,
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site
FROM
data-to-insights.ecommerce.web_analytics
WHERE
totals.newVisits = 1
AND date BETWEEN '20160801' AND '20170430') # train on first 9 months
JOIN
(SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM
data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid)
USING (fullVisitorId)
;
別の期間で評価する。
code: eval.sql
SELECT
roc_auc,
CASE
WHEN roc_auc > .9 THEN 'good'
WHEN roc_auc > .8 THEN 'fair'
WHEN roc_auc > .7 THEN 'decent'
WHEN roc_auc > .6 THEN 'not great'
ELSE 'poor' END AS model_quality
FROM
ML.EVALUATE(MODEL ecommerce.classification_model, (
SELECT
* EXCEPT(fullVisitorId)
FROM
# features
(SELECT
fullVisitorId,
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site
FROM
data-to-insights.ecommerce.web_analytics
WHERE
totals.newVisits = 1
AND date BETWEEN '20170501' AND '20170630') # eval on 2 months
JOIN
(SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM
data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid)
USING (fullVisitorId)
));
予測する。
code: predict.sql
SELECT
*
FROM
ml.PREDICT(MODEL ecommerce.classification_model,
(
WITH all_visitor_stats AS (
SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid
)
SELECT
CONCAT(fullvisitorid, '-',CAST(visitId AS STRING)) AS unique_session_id,
# labels
will_buy_on_return_visit,
# behavior on the site
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site,
FROM data-to-insights.ecommerce.web_analytics,
JOIN all_visitor_stats USING(fullvisitorid)
WHERE
# only predict for new visits
totals.newVisits = 1
AND date BETWEEN '20170701' AND '20170801' # test 1 month
GROUP BY
unique_session_id,
will_buy_on_return_visit,
bounces,
time_on_site,
)
)
ORDER BY
predicted_will_buy_on_return_visit DESC;
これで、予測値から上位N%を見た場合に、どれくらいliftがあるかをみる。
実際のqwiklabsでは、もう少し特徴量を増やしたものをやる(そのセッションで ecommerce_actionがどのstepまでいったかどうか)